{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用 TensorFlow 构造隐语义模型的推荐系统\n",
    "\n",
    "- 3,900 个电影\n",
    "- 6,040 个用户\n",
    "\n",
    "数据简介: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt\n",
    "\n",
    "数据下载地址：http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
    "### TensorFlow 安装包下载：http://www.lfd.uci.edu/~gohlke/pythonlibs/#tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imports for data io operations\n",
    "from collections import deque\n",
    "from six import next\n",
    "import readers\n",
    "\n",
    "# Main imports for training\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "\n",
    "# Evaluate train times per epoch\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Seed every random source used below so that training runs are repeatable:\n",
    "# NumPy drives the row shuffle in get_data(), TensorFlow drives the\n",
    "# truncated-normal initialisation of the embedding variables in model().\n",
    "SEED = 42\n",
    "np.random.seed(SEED)\n",
    "tf.set_random_seed(SEED)\n",
    "\n",
    "# Row counts of the user and item tables created by model().\n",
    "# NOTE(review): the notes at the top quote 3900 movies while i_num is 3952 --\n",
    "# presumably movie IDs are sparse and run up to 3952; confirm against the\n",
    "# dataset README.\n",
    "u_num = 6040 # Number of users in the dataset\n",
    "i_num = 3952 # Number of rows reserved for movies (see note above)\n",
    "\n",
    "batch_size = 1000 # Number of samples per training batch\n",
    "dims = 5          # Length of each user/item embedding vector (the original note also mentions 15)\n",
    "max_epochs = 50   # Number of times the network sees all the training data\n",
    "\n",
    "# Device passed to model() and loss() for the prediction and cost arithmetic\n",
    "place_device = \"/cpu:0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    \"\"\"Load the ratings file, shuffle its rows and split them 90/10.\n",
    "\n",
    "    The file is read with '::' as the delimiter; a sample line is\n",
    "    3::1196::4::978297539 (user ID, item ID, rating, timestamp).\n",
    "\n",
    "    Returns:\n",
    "        (train, test): the first 90% of the shuffled rows and the remaining\n",
    "        10%; the test frame is re-indexed from 0.\n",
    "    \"\"\"\n",
    "    ratings = readers.read_file(\"./ml-1m/ratings.dat\", sep=\"::\")\n",
    "    n_rows = len(ratings)\n",
    "    # Random row order drawn from NumPy's global RNG, then a fresh 0..n-1 index\n",
    "    shuffled = ratings.iloc[np.random.permutation(n_rows)].reset_index(drop=True)\n",
    "    # Row position where the training part ends and the test part begins\n",
    "    cut = int(n_rows * 0.9)\n",
    "    train_part = shuffled[:cut]\n",
    "    test_part = shuffled[cut:].reset_index(drop=True)\n",
    "    return train_part, test_part\n",
    "\n",
    "def clip(x):\n",
    "    return np.clip(x, 1.0, 5.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def model(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n",
    "    \"\"\"Build the biased latent-factor prediction graph for a batch of (user, item) ids.\n",
    "\n",
    "    For every pair the prediction is\n",
    "        sum(user_embedding * item_embedding) + global_bias + user_bias + item_bias\n",
    "\n",
    "    Args:\n",
    "        user_batch: tensor of user ids, used as row indices into the user tables.\n",
    "        item_batch: tensor of item ids, used as row indices into the item tables.\n",
    "        user_num: number of rows in the user bias and embedding tables.\n",
    "        item_num: number of rows in the item bias and embedding tables.\n",
    "        dim: length of each user/item embedding vector.\n",
    "        device: device for the prediction arithmetic only; the variables and the\n",
    "            embedding lookups are always placed on \"/cpu:0\".\n",
    "\n",
    "    Returns:\n",
    "        (infer, regularizer): the predicted ratings (op name \"svd_inference\") and\n",
    "        the sum of the L2 losses of the looked-up user and item embeddings\n",
    "        (op name \"svd_regularizer\").\n",
    "    \"\"\"\n",
    "    with tf.device(\"/cpu:0\"):\n",
    "        # AUTO_REUSE creates the variables on the first call and reuses them on\n",
    "        # later calls (e.g. when the cell is re-run). reuse=True alone fails in a\n",
    "        # fresh graph because there is nothing to reuse yet. Needs TF >= 1.4.\n",
    "        with tf.variable_scope('lsi', reuse=tf.AUTO_REUSE):\n",
    "            # Using a global bias term\n",
    "            bias_global = tf.get_variable(\"bias_global\", shape=[])\n",
    "            # User and item bias variables\n",
    "            # get_variable: Prefixes the name with the current variable scope \n",
    "            # and performs reuse checks.\n",
    "            w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n",
    "            w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n",
    "            # embedding_lookup: Looks up 'ids' in a list of embedding tensors\n",
    "            # Bias embeddings for user and items, given a batch\n",
    "            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n",
    "            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n",
    "            # User and item weight variables, initialised with small random values\n",
    "            w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n",
    "                                     initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "            w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n",
    "                                     initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "            # Weight embeddings for user and items, given a batch\n",
    "            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n",
    "            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n",
    "    \n",
    "    with tf.device(device):\n",
    "        # Element-wise product summed over axis 1: one dot product per (user, item) pair\n",
    "        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)\n",
    "        infer = tf.add(infer, bias_global)\n",
    "        infer = tf.add(infer, bias_user)\n",
    "        infer = tf.add(infer, bias_item, name=\"svd_inference\")\n",
    "        # l2_loss: Computes half the L2 norm of a tensor without the sqrt\n",
    "        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), \n",
    "                             name=\"svd_regularizer\")\n",
    "    return infer, regularizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device=\"/cpu:0\"):\n",
    "    \"\"\"Create the regularised squared-error cost and a gradient-descent step for it.\n",
    "\n",
    "    Args:\n",
    "        infer: predicted ratings.\n",
    "        regularizer: regularisation term to be weighted by reg.\n",
    "        rate_batch: true ratings the predictions are compared against.\n",
    "        learning_rate: step size handed to the gradient-descent optimizer.\n",
    "        reg: weight applied to regularizer before it is added to the cost.\n",
    "        device: device on which the cost and training ops are placed.\n",
    "\n",
    "    Returns:\n",
    "        (cost, train_op): the scalar cost tensor and the op that minimises it.\n",
    "    \"\"\"\n",
    "    with tf.device(device):\n",
    "        # Data term: l2_loss of the difference between predictions and true ratings\n",
    "        prediction_error = tf.subtract(infer, rate_batch)\n",
    "        cost_l2 = tf.nn.l2_loss(prediction_error)\n",
    "        # Regularisation weight held as a scalar graph constant\n",
    "        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n",
    "        weighted_regularizer = tf.multiply(regularizer, penalty)\n",
    "        cost = tf.add(cost_l2, weighted_regularizer)\n",
    "        # Plain gradient descent on the combined cost\n",
    "        optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
    "        train_op = optimizer.minimize(cost)\n",
    "    return cost, train_op"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of train samples 900188, test samples 100021, samples per batch 900\n"
     ]
    }
   ],
   "source": [
    "# Read data from ratings file to build a TF model\n",
    "df_train, df_test = get_data()\n",
    "\n",
    "# Despite its name, samples_per_batch is the number of full batches in one pass\n",
    "# over the training split (rows // batch_size). The training cell uses it as the\n",
    "# number of steps per epoch and as the length of its error window.\n",
    "samples_per_batch = len(df_train) // batch_size\n",
    "print(\"Number of train samples %d, test samples %d, samples per batch %d\" % \n",
    "      (len(df_train), len(df_test), samples_per_batch))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    1463\n",
      "1    1260\n",
      "2    1205\n",
      "3    4657\n",
      "4    5604\n",
      "Name: user, dtype: int32\n",
      "0    4138\n",
      "1    5233\n",
      "2    1419\n",
      "3    2019\n",
      "4    2786\n",
      "Name: user, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Show the first 5 user ids of the training split, then of the test split\n",
    "for split in (df_train, df_test):\n",
    "    print(split[\"user\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2114\n",
      "1    1209\n",
      "2     328\n",
      "3     615\n",
      "4    2500\n",
      "Name: item, dtype: int32\n",
      "0    1403\n",
      "1    3741\n",
      "2    1731\n",
      "3    2114\n",
      "4    3104\n",
      "Name: item, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Show the first 5 item ids of the training split, then of the test split\n",
    "for split in (df_train, df_test):\n",
    "    print(split[\"item\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2.0\n",
      "1    5.0\n",
      "2    4.0\n",
      "3    2.0\n",
      "4    4.0\n",
      "Name: rate, dtype: float32\n",
      "0    3.0\n",
      "1    5.0\n",
      "2    5.0\n",
      "3    3.0\n",
      "4    3.0\n",
      "Name: rate, dtype: float32\n"
     ]
    }
   ],
   "source": [
    "# Show the first 5 ratings of the training split, then of the test split\n",
    "for split in (df_train, df_test):\n",
    "    print(split[\"rate\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Using a shuffle iterator to generate random batches, for training\n",
    "# (the training loop pulls one batch from it per step with next())\n",
    "iter_train = readers.ShuffleIterator([df_train[\"user\"],\n",
    "                                     df_train[\"item\"],\n",
    "                                     df_train[\"rate\"]],\n",
    "                                     batch_size=batch_size)\n",
    "\n",
    "# Sequentially generate one-epoch batches, for testing\n",
    "# NOTE(review): batch_size=-1 presumably makes the iterator return the whole test\n",
    "# split as a single batch -- confirm in readers.OneEpochIterator\n",
    "iter_test = readers.OneEpochIterator([df_test[\"user\"],\n",
    "                                     df_test[\"item\"],\n",
    "                                     df_test[\"rate\"]],\n",
    "                                     batch_size=-1)\n",
    "\n",
    "# Graph inputs: 1-D batches of user ids, item ids and the ratings to fit\n",
    "user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n",
    "item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n",
    "rate_batch = tf.placeholder(tf.float32, shape=[None])\n",
    "\n",
    "# Build the prediction graph and the training step. Only train_op is kept from\n",
    "# loss(); the cost tensor is discarded. learning_rate and reg are passed\n",
    "# explicitly here rather than relying on the defaults of loss().\n",
    "infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)\n",
    "_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch\tTrain Error\tVal Error\tElapsed Time\n",
      "00\t2.817\t\t1.114\t\t0.034 secs\n",
      "01\t1.047\t\t1.003\t\t1.028 secs\n",
      "02\t0.982\t\t0.968\t\t1.038 secs\n",
      "03\t0.955\t\t0.950\t\t1.011 secs\n",
      "04\t0.941\t\t0.940\t\t1.005 secs\n",
      "05\t0.932\t\t0.934\t\t1.092 secs\n",
      "06\t0.927\t\t0.929\t\t1.317 secs\n",
      "07\t0.923\t\t0.926\t\t1.175 secs\n",
      "08\t0.918\t\t0.923\t\t1.117 secs\n",
      "09\t0.916\t\t0.921\t\t1.013 secs\n",
      "10\t0.914\t\t0.919\t\t1.019 secs\n",
      "11\t0.911\t\t0.918\t\t1.032 secs\n",
      "12\t0.910\t\t0.917\t\t1.037 secs\n",
      "13\t0.909\t\t0.917\t\t1.235 secs\n",
      "14\t0.908\t\t0.915\t\t1.156 secs\n",
      "15\t0.907\t\t0.914\t\t1.420 secs\n",
      "16\t0.907\t\t0.914\t\t1.324 secs\n",
      "17\t0.905\t\t0.914\t\t1.134 secs\n",
      "18\t0.904\t\t0.914\t\t1.020 secs\n",
      "19\t0.904\t\t0.913\t\t1.022 secs\n",
      "20\t0.904\t\t0.913\t\t1.065 secs\n",
      "21\t0.903\t\t0.912\t\t1.005 secs\n",
      "22\t0.902\t\t0.912\t\t1.006 secs\n",
      "23\t0.903\t\t0.911\t\t1.020 secs\n",
      "24\t0.902\t\t0.911\t\t1.036 secs\n",
      "25\t0.901\t\t0.911\t\t1.071 secs\n",
      "26\t0.902\t\t0.912\t\t1.014 secs\n",
      "27\t0.900\t\t0.911\t\t0.994 secs\n",
      "28\t0.901\t\t0.911\t\t1.014 secs\n",
      "29\t0.902\t\t0.910\t\t1.007 secs\n",
      "30\t0.902\t\t0.911\t\t1.046 secs\n",
      "31\t0.901\t\t0.910\t\t0.996 secs\n",
      "32\t0.899\t\t0.910\t\t0.996 secs\n",
      "33\t0.900\t\t0.910\t\t1.010 secs\n",
      "34\t0.899\t\t0.911\t\t1.010 secs\n",
      "35\t0.900\t\t0.910\t\t1.037 secs\n",
      "36\t0.899\t\t0.910\t\t0.999 secs\n",
      "37\t0.900\t\t0.911\t\t0.990 secs\n",
      "38\t0.900\t\t0.910\t\t1.010 secs\n",
      "39\t0.900\t\t0.910\t\t1.009 secs\n",
      "40\t0.899\t\t0.910\t\t1.040 secs\n",
      "41\t0.900\t\t0.911\t\t0.994 secs\n",
      "42\t0.900\t\t0.910\t\t0.996 secs\n",
      "43\t0.898\t\t0.911\t\t1.013 secs\n",
      "44\t0.899\t\t0.910\t\t1.013 secs\n",
      "45\t0.899\t\t0.910\t\t1.036 secs\n",
      "46\t0.899\t\t0.910\t\t0.999 secs\n",
      "47\t0.897\t\t0.909\t\t0.993 secs\n",
      "48\t0.899\t\t0.910\t\t1.012 secs\n",
      "49\t0.900\t\t0.910\t\t1.007 secs\n"
     ]
    }
   ],
   "source": [
    "# Train for max_epochs passes, report the RMSE on the training window and on the\n",
    "# test split once per epoch, then save the trained variables.\n",
    "saver = tf.train.Saver()\n",
    "init_op = tf.global_variables_initializer()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(init_op)\n",
    "    print(\"%s\\t%s\\t%s\\t%s\" % (\"Epoch\", \"Train Error\", \"Val Error\", \"Elapsed Time\"))\n",
    "    # Squared errors of the most recent batches; the window holds one epoch's worth\n",
    "    errors = deque(maxlen=samples_per_batch)\n",
    "    start = time.time()\n",
    "    for i in range(max_epochs * samples_per_batch):\n",
    "        users, items, rates = next(iter_train)\n",
    "        # One optimisation step; the predictions are fetched as well so the\n",
    "        # training error can be tracked without a second pass\n",
    "        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n",
    "                                                               item_batch: items,\n",
    "                                                               rate_batch: rates})\n",
    "        pred_batch = clip(pred_batch)\n",
    "        errors.append(np.power(pred_batch - rates, 2))\n",
    "        # Once per epoch (including step 0): report train and test RMSE\n",
    "        if i % samples_per_batch == 0:\n",
    "            train_err = np.sqrt(np.mean(errors))\n",
    "            test_err2 = np.array([])\n",
    "            for users, items, rates in iter_test:\n",
    "                # Inference only: rate_batch is not needed to evaluate infer\n",
    "                pred_batch = sess.run(infer, feed_dict={user_batch: users,\n",
    "                                                        item_batch: items})\n",
    "                pred_batch = clip(pred_batch)\n",
    "                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n",
    "            end = time.time()\n",
    "            \n",
    "            print(\"%02d\\t%.3f\\t\\t%.3f\\t\\t%.3f secs\" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))\n",
    "            # Restart the timer so the next line reports the time of one epoch\n",
    "            start = end\n",
    "\n",
    "    saver.save(sess, './save/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![alt text](TrainValError.png \"Train / Validation Error vs. Epoch\")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
